{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pdfplumber\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the source PDF; the handle is kept in `pdf` for the cells below.\n",
    "pdf = pdfplumber.open(\n",
    "    'data/比赛名单.pdf'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the rows of the table extracted from every page into one list.\n",
    "table = []\n",
    "for page in pdf.pages:\n",
    "    # extract_table() returns None when a page has no table; skip such\n",
    "    # pages so extend() is never handed None (which raises TypeError).\n",
    "    page_rows = page.extract_table()\n",
    "    if page_rows:\n",
    "        table.extend(page_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the DataFrame from the extracted rows and name its five columns.\n",
    "df = pd.DataFrame(\n",
    "    table,\n",
    "    columns=(\"学校\", \"项目名称\", \"大类\", \"参赛学生\", \"指导教师\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "学校          兰州信息科技学院\n",
       "项目名称     基于疫情下的宠物云管家\n",
       "大类           软件应用与开发\n",
       "参赛学生    刘纪尧  焦守武  林聪\n",
       "指导教师        陈文娟  马生菊\n",
       "Name: 99, dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect a single record: the row whose index label is 99, all columns.\n",
    "df.loc[99, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the DataFrame's index labels as an array in `df1`.\n",
    "df1 = df.index.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many rows each distinct value of the 学校 column has.\n",
    "df2 = df[\"学校\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>学校</th>\n",
       "      <th>项目名称</th>\n",
       "      <th>大类</th>\n",
       "      <th>参赛学生</th>\n",
       "      <th>指导教师</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>409</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《十二时辰》</td>\n",
       "      <td>微课与教学辅助</td>\n",
       "      <td>张元飞  蒋瑶昕  刘若琪</td>\n",
       "      <td>张辉刚</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>410</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《悯农（其二）》</td>\n",
       "      <td>微课与教学辅助</td>\n",
       "      <td>冼慧玲  唐万福  冯金凤</td>\n",
       "      <td>李肖霞  祁芸</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>411</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《卖火柴的小女孩》</td>\n",
       "      <td>微课与教学辅助</td>\n",
       "      <td>汪思黔  曹紫悦  吴会琴</td>\n",
       "      <td>朱万侠</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>412</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《文化的记忆：汉字历史数字化交互馆》</td>\n",
       "      <td>数媒游戏与交互设计专业组</td>\n",
       "      <td>陈王兴  刘睿悦  刘雅婷</td>\n",
       "      <td>刘洋  张志腾</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>413</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《汉衍书意-汉字发展历史数字化交互云展厅》</td>\n",
       "      <td>数媒游戏与交互设计专业组</td>\n",
       "      <td>杨娟  钱鑫  廖美琳  白舒怡  李林书</td>\n",
       "      <td>张志腾  冷明伟</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>414</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《声声慢》</td>\n",
       "      <td>数媒游戏与交互设计专业组</td>\n",
       "      <td>张紫淇  鲁淏文  陈丹芬</td>\n",
       "      <td>杨志宏</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>415</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《说文解字》</td>\n",
       "      <td>数媒静态设计（普通组）</td>\n",
       "      <td>韦霏霏  官琳琳</td>\n",
       "      <td>崔永鹏  刘舒雯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>416</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《中华传统文化五行之属》</td>\n",
       "      <td>数媒静态设计专业组</td>\n",
       "      <td>文宣  徐慧子  庞靖</td>\n",
       "      <td>李文丽  李瑾</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>417</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《汉字的魅力—四季之美》</td>\n",
       "      <td>数媒静态设计专业组</td>\n",
       "      <td>万晋妤  彭键</td>\n",
       "      <td>李文丽  李瑾</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>418</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《汉字溢彩传经典，一撇一捺总关情》</td>\n",
       "      <td>数媒静态设计专业组</td>\n",
       "      <td>艾力飞热·艾尼玩  西仁娜依·阿布来克  仲格尔吉</td>\n",
       "      <td>刘舒雯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>419</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《为霞尚满天》</td>\n",
       "      <td>数媒动漫与短片（普通组）</td>\n",
       "      <td>马尚凯  张骞文  李媛钰  何波  林琳</td>\n",
       "      <td>张辉刚  石静</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>420</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《石榴花开的声音》</td>\n",
       "      <td>数媒动漫与短片专业组</td>\n",
       "      <td>王宝国  林焮婷  马念  张衍鹏  高巧灵</td>\n",
       "      <td>张辉刚</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>421</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《马家窑文化彩陶信息可视化设计》</td>\n",
       "      <td>信息可视化设计</td>\n",
       "      <td>张金星  赵紫琪</td>\n",
       "      <td>李文丽  李君利</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>422</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《兰州野生动物园信息可视化》</td>\n",
       "      <td>信息可视化设计</td>\n",
       "      <td>王艺霏  乌日娜  陈强</td>\n",
       "      <td>李文丽  李瑾</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>423</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>防疫式新型冷库管理系统</td>\n",
       "      <td>物联网应用</td>\n",
       "      <td>廖云强  齐彦  杨宏宇</td>\n",
       "      <td>杨雪松</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>424</th>\n",
       "      <td>西北民族大学</td>\n",
       "      <td>《共享外卖自提箱》</td>\n",
       "      <td>物联网应用</td>\n",
       "      <td>周瑞婷  吕坤山  孙浩瑜</td>\n",
       "      <td>张国恒</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         学校                   项目名称            大类                       参赛学生  \\\n",
       "409  西北民族大学                 《十二时辰》       微课与教学辅助              张元飞  蒋瑶昕  刘若琪   \n",
       "410  西北民族大学               《悯农（其二）》       微课与教学辅助              冼慧玲  唐万福  冯金凤   \n",
       "411  西北民族大学              《卖火柴的小女孩》       微课与教学辅助              汪思黔  曹紫悦  吴会琴   \n",
       "412  西北民族大学     《文化的记忆：汉字历史数字化交互馆》  数媒游戏与交互设计专业组              陈王兴  刘睿悦  刘雅婷   \n",
       "413  西北民族大学  《汉衍书意-汉字发展历史数字化交互云展厅》  数媒游戏与交互设计专业组      杨娟  钱鑫  廖美琳  白舒怡  李林书   \n",
       "414  西北民族大学                  《声声慢》  数媒游戏与交互设计专业组              张紫淇  鲁淏文  陈丹芬   \n",
       "415  西北民族大学                 《说文解字》   数媒静态设计（普通组）                   韦霏霏  官琳琳   \n",
       "416  西北民族大学           《中华传统文化五行之属》     数媒静态设计专业组                文宣  徐慧子  庞靖   \n",
       "417  西北民族大学           《汉字的魅力—四季之美》     数媒静态设计专业组                    万晋妤  彭键   \n",
       "418  西北民族大学      《汉字溢彩传经典，一撇一捺总关情》     数媒静态设计专业组  艾力飞热·艾尼玩  西仁娜依·阿布来克  仲格尔吉   \n",
       "419  西北民族大学                《为霞尚满天》  数媒动漫与短片（普通组）      马尚凯  张骞文  李媛钰  何波  林琳   \n",
       "420  西北民族大学              《石榴花开的声音》    数媒动漫与短片专业组     王宝国  林焮婷  马念  张衍鹏  高巧灵   \n",
       "421  西北民族大学       《马家窑文化彩陶信息可视化设计》       信息可视化设计                   张金星  赵紫琪   \n",
       "422  西北民族大学         《兰州野生动物园信息可视化》       信息可视化设计               王艺霏  乌日娜  陈强   \n",
       "423  西北民族大学            防疫式新型冷库管理系统         物联网应用               廖云强  齐彦  杨宏宇   \n",
       "424  西北民族大学              《共享外卖自提箱》         物联网应用              周瑞婷  吕坤山  孙浩瑜   \n",
       "\n",
       "         指导教师  \n",
       "409       张辉刚  \n",
       "410   李肖霞  祁芸  \n",
       "411       朱万侠  \n",
       "412   刘洋  张志腾  \n",
       "413  张志腾  冷明伟  \n",
       "414       杨志宏  \n",
       "415  崔永鹏  刘舒雯  \n",
       "416   李文丽  李瑾  \n",
       "417   李文丽  李瑾  \n",
       "418       刘舒雯  \n",
       "419   张辉刚  石静  \n",
       "420       张辉刚  \n",
       "421  李文丽  李君利  \n",
       "422   李文丽  李瑾  \n",
       "423       杨雪松  \n",
       "424       张国恒  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every row whose 学校 column equals 西北民族大学.\n",
    "df.query(\n",
    "    '学校==\"西北民族大学\"'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_works_amount_by_school(df, school: str) -> int:\n",
    "    \"\"\"Return how many rows of ``df`` have ``school`` in the ``学校`` column.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame containing a ``学校`` column.\n",
    "        school: Exact school name to match.\n",
    "\n",
    "    Returns:\n",
    "        The number of matching rows as a plain ``int`` (0 if none match).\n",
    "    \"\"\"\n",
    "    # Compare the column directly rather than formatting `school` into a\n",
    "    # query string, so a name containing a double quote cannot break or\n",
    "    # change the filter expression.\n",
    "    return int((df[\"学校\"] == school).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>学校</th>\n",
       "      <th>项目名称</th>\n",
       "      <th>大类</th>\n",
       "      <th>参赛学生</th>\n",
       "      <th>指导教师</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [学校, 项目名称, 大类, 参赛学生, 指导教师]\n",
       "Index: []"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows belonging to any of the listed schools. Membership needs `in [...]`:\n",
    "# string literals written back to back (\"a\"\"b\") are joined into a single\n",
    "# name, which matches no row.\n",
    "# NOTE(review): \"西安交通科技大学\" appears nowhere else in this notebook (a later\n",
    "# cell counts \"西安建筑科技大学\") - confirm the intended school name.\n",
    "df.query('学校 in [\"西北民族大学\", \"兰州大学\", \"西安交通科技大学\"]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows whose 学校 is 西北民族大学.\n",
    "get_works_amount_by_school(df, school=\"西北民族大学\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows whose 学校 is 兰州大学.\n",
    "get_works_amount_by_school(df, school=\"兰州大学\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows whose 学校 is 西北师范大学.\n",
    "get_works_amount_by_school(df, school=\"西北师范大学\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows whose 学校 is 西安建筑科技大学.\n",
    "get_works_amount_by_school(df, school=\"西安建筑科技大学\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.10 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "e5b51f9075b4cc1ea8d9810577a26807122690438b3a6e6e05129a402faed2ba"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
